--- title: Data Augmentation for Audio keywords: fastai sidebar: home_sidebar summary: "Transforms to apply data augmentation to AudioSpectrograms and Signals" ---
# -- Setup: download the ST-AEDS speakers dataset and build the example items
#    (audio_orig, ex_files, a2s, sg_orig) used by every section below --
p = Config()['data_path'] / 'ST-AEDS-20180100_1-OS'
untar_data(URLs.SPEAKERS10, fname=str(p)+'.tar', dest=p)
x = AudioGetter("", recurse=True, folders=None)
files = x(p)
#files will load differently on different machines so we specify examples by name
ex_files = [p/f for f in ['m0005_us_m0005_00218.wav',
'f0003_us_f0003_00279.wav',
'f0001_us_f0001_00168.wav',
'f0005_us_f0005_00286.wav',]]
audio_orig = AudioItem.create(ex_files[0])
# spectrogram transform reused throughout this page
a2s = AudioToSpec(n_fft = 1024, hop_length=256)
sg_orig = a2s(audio_orig)
#sc= single channel, mc = multichannel
def _audio_sc_ex():
    "Build a fresh single-channel AudioItem from the first example file."
    path = ex_files[0]
    return AudioItem.create(path)
def _audio_mc_ex():
    """Fake a multichannel AudioItem by stacking equal-length slices of three
    different mono example clips (hard-coded 16000 sample rate, no path)."""
    items = [AudioItem.create(f) for f in ex_files[1:4]]
    # trim all three signals to the shortest so they can be stacked
    shortest = min(it.nsamples for it in items)
    sigs = [it.sig[:, :shortest] for it in items]
    stacked = torch.stack(sigs, dim=1).squeeze(0)
    return AudioItem((stacked, 16000, None))
def _audio_batch_ex(bs):
    "Fake a batch of `bs` copies of the single-channel example signal."
    sigs = [AudioItem.create(ex_files[0]).sig for _ in range(bs)]
    return AudioItem((torch.stack(sigs), 16000, None))
def _audio_mc_batch_ex(bs):
    "Fake a batch of `bs` multichannel example signals."
    sigs = [_audio_mc_ex().sig for _ in range(bs)]
    return AudioItem((torch.stack(sigs), 16000, None))
#sg_multi = a2s(fake_multichannel)
# Smoke-test the example builders: types and expected batch signal shapes.
aud_ex = _audio_sc_ex()
aud_mc_ex = _audio_mc_ex()
aud_batch = _audio_batch_ex(4)
aud_mc_batch = _audio_mc_batch_ex(8)
test_eq(type(aud_ex), AudioItem)
test_eq(type(aud_batch), AudioItem)
test_eq(aud_batch.sig.shape, torch.Size([4, 1, 58240]))
test_eq(aud_mc_batch.sig.shape, torch.Size([8, 3, 53760]))
# RemoveSilence should shorten a clip that contains silence
silence_audio = RemoveSilence(threshold=20, pad_ms=20)(audio_orig)
audio_orig.show()
silence_audio.show()
#test that at least a half second of silence is being removed
test(silence_audio.nsamples + 8000, audio_orig.nsamples, operator.le)
#test that nothing is removed from audio that doesn't contain silence
test_aud = AudioItem((torch.rand_like(audio_orig.sig), 16000, None))
print("Random Noise, no silence")
test_aud.hear()
for rm_type in [RemoveType.All, RemoveType.Trim, RemoveType.Split]:
silence_audio_trim = RemoveSilence(rm_type, threshold=20, pad_ms=20)(test_aud)
test_eq(test_aud.nsamples, silence_audio_trim.nsamples)
# trim silence from a multichannel clip, needs more extensive testing
fake_multichannel = _audio_mc_ex()
silence_mc = RemoveSilence(threshold=20, pad_ms=20)(fake_multichannel)
print(silence_mc.sig.shape) #still 3 channels
fake_multichannel.hear()
silence_mc.hear()
# time RemoveSilence on mono vs multichannel input
silencer = RemoveSilence(threshold=20, pad_ms=20)
%%timeit -n10
silencer(audio_orig)
%%timeit -n10
silencer(fake_multichannel)
#Make sure if old and new sample rates are the same, a new identical AudioItem is returned
no_resample_needed = Resample(audio_orig.sr)(audio_orig)
# identity resample must return a *copy*, not the same object
assert(not no_resample_needed is audio_orig)
test_eq(audio_orig.sr, no_resample_needed.sr)
test_eq(audio_orig.sig, no_resample_needed.sig)
#test and hear realistic sample rates
print("Original, Sample Rate", audio_orig.sr)
audio_orig.hear()
for rate in [4000,8000,22050,44100]:
resampled = Resample(rate)(audio_orig)
orig_samples = audio_orig.nsamples
re_samples = resampled.nsamples
print("Sample Rate", rate)
resampled.hear()
test_eq(re_samples, orig_samples//(audio_orig.sr/rate))
#resample a multichannel audio
resampled = Resample(8000)(fake_multichannel)
test_eq(fake_multichannel.nsamples//2, resampled.nsamples)
test_eq(fake_multichannel.nchannels, resampled.nchannels)
test_eq(resampled.sr, 8000)
# fuzz: random target rates should still yield the expected sample count (within 1)
for i in range(100):
random_sr = random.randint(16000, 72000)
random_upsample = Resample(random_sr)(audio_orig)
num_samples = random_upsample.nsamples
test_close(num_samples, abs(orig_samples//(audio_orig.sr/random_sr)), eps=1.1)
# Polyphase resampling's speed is dependent on the GCD between old and new rate. For almost all used sample rates it
# will be very fast and much better than any FFT based method. It is slow however in the unlikely event that the
# GCD is small (demonstrated below w GCD of 1 for last 2 examples)
common_downsample = Resample(8000)
slow_downsample = Resample(8001)
slow_upsample = Resample(27101)
%%timeit -n10
common_downsample(audio_orig)
%%timeit -n10
common_downsample(fake_multichannel)
%%timeit -n10
slow_downsample(audio_orig)
%%timeit -n10
slow_upsample(audio_orig)
# -- CropSignal: crop or pad raw signals to a fixed duration in ms --
cropsig_1000ms = CropSignal(1000)
cropsig_2000ms = CropSignal(2000)
cropsig_5000ms = CropSignal(5000, pad_mode=AudioPadType.Zeros_After)
print(f"Audio is {audio_orig.duration} seconds")
aud1s = cropsig_1000ms(audio_orig)
aud2s = cropsig_2000ms(audio_orig)
aud5s = cropsig_5000ms(audio_orig)
audio_orig.show()
aud1s.show()
aud2s.show()
aud5s.show()
# cropped length in samples must be duration(s) * sample rate
test_eq(aud1s.nsamples, 1*audio_orig.sr)
test_eq(aud2s.nsamples, 2*audio_orig.sr)
test_eq(aud5s.nsamples, 5*audio_orig.sr)
test_eq(aud1s.duration, 1)
test_eq(aud2s.duration, 2)
test_eq(aud5s.duration, 5)
# same checks on multichannel input
mc1s = cropsig_1000ms(fake_multichannel)
mc2s = cropsig_2000ms(fake_multichannel)
mc5s = cropsig_5000ms(fake_multichannel)
test_eq(mc1s.duration, 1)
test_eq(mc2s.duration, 2)
test_eq(mc5s.duration, 5)
# test pad_mode zeros-after
test_aud = AudioItem((torch.rand_like(audio_orig.sig), 16000, None))
cropsig_pad = CropSignal(5000, pad_mode=AudioPadType.Zeros_After)
z_after = cropsig_pad(test_aud)
test_aud.hear()
z_after.hear()
# test end of signal is padded with zeros
test_eq(z_after.sig[:,-10:], torch.zeros_like(z_after.sig)[:,-10:])
# test front of signal is not padded with zeros
test_ne(z_after.sig[:,0:10] , z_after.sig[:,-10:])
# test pad_mode zeros by verifying signal begins and ends with zeros
test_aud.hear()
cropsig_pad = CropSignal(5000)
z_after = cropsig_pad(test_aud)
z_after.hear()
test_eq(z_after.sig[:,0:2], z_after.sig[:,-2:])
# test pad_mode repeat by making sure that columns are equal at the appropriate offsets
cropsig_repeat = CropSignal(12000, pad_mode=AudioPadType.Repeat)
ai_repeat = cropsig_repeat(audio_orig)
ai_repeat.show()
sig_repeat = ai_repeat.sig
for i in range(audio_orig.nsamples):
test_eq(sig_repeat[:,i], sig_repeat[:,i+audio_orig.nsamples])
test_eq(sig_repeat[:,i], sig_repeat[:,i+2*audio_orig.nsamples])
# test bad pad_mode doesn't fail silently
test_fail(CropSignal(12000, pad_mode="tenchify"))
# demonstrate repeat mode works on multichannel data (uncomment to see)
mc_repeat = cropsig_repeat(fake_multichannel)
#mc_repeat.show()
%%timeit -n10
aud1s = cropsig_1000ms(audio_orig)
%%timeit -n10
aud2s = cropsig_2000ms(audio_orig)
%%timeit -n10
aud5s = cropsig_5000ms(audio_orig)
#v1 used scipy.ndimage.interpolation.shift but it was extremely slow (14-16ms) so I rewrote and got it down to 50µs
np.roll(np.array([1,2,3,4,5,6,7]), 2)
# NOTE: the block below is the pre-refactor implementation of _shift/ShiftSignal,
# kept commented out for reference only.
# def _shift(sig, s):
# channels, samples = sig.shape[-2:]
# if s == 0: return torch.clone(sig)
# elif s < 0: return torch.cat([sig[...,-1*s:], torch.zeros_like(sig)[...,s:]], dim=-1)
# else : return torch.cat([torch.zeros_like(sig)[...,:s], sig[...,:samples-s]], dim=-1)
# #export
# def ShiftSignal(max_pct=0.2, max_time=None, roll=False):
# def _inner(ai: AudioItem)->AudioItem:
# s = int(random.uniform(-1, 1)*max_pct*ai.nsamples if max_time is None else random.uniform(-1, 1)*max_time*ai.sr)
# sig = torch.from_numpy(np.roll(ai.sig.numpy(), s, axis=1)) if roll else _shift(ai.sig, s)
# return AudioItem((sig, ai.sr, ai.path))
# return _inner
def _shift(sig, s):
samples = sig.shape[-1]
if s == 0: return torch.clone(sig)
elif s < 0: return torch.cat([sig[...,-1*s:], torch.zeros_like(sig)[...,s:]], dim=-1)
else : return torch.cat([torch.zeros_like(sig)[...,:s], sig[...,:samples-s]], dim=-1)
def shift_signal(t:torch.Tensor, shift, roll):
    """Shift `t` along its final axis by `shift` samples, in place.

    If `roll` is True the samples wrap around; otherwise the vacated
    positions are zero-filled via `_shift`. Returns the mutated tensor.
    """
    if roll:
        # torch.roll on the final axis replaces the old numpy round-trip
        # (np.roll on t.numpy()), which forced a CPU tensor->ndarray copy.
        t[:] = torch.roll(t, shift, dims=-1)
    else:
        t[:] = _shift(t[:], shift)
    return t
# -- Fixtures and sanity checks for _shift / shift_signal / SignalShifter --
t1 = torch.tensor([[1,2,3,4,5,6,7,8,9,10]])
t3 = torch.tensor([[1,2,3,4,5,6,7,8,9,10],[11,12,13,14,15,16,17,18,19,20],[21,22,23,24,25,26,27,28,29,30]])
b4 = torch.stack([t3,t3,t3,t3])
test_eq(b4.shape, torch.Size([4, 3, 10]))
test_eq(_shift(t1,4), tensor([[0, 0, 0, 0, 1, 2, 3, 4, 5, 6]]))
test_eq(_shift(t3,-2), tensor([[3,4,5,6,7,8,9,10,0,0],[13,14,15,16,17,18,19,20,0,0],[23,24,25,26,27,28,29,30,0,0]]))
shift_signal(b4, 4, roll=False)
audio_orig = _audio_sc_ex()
#ipython player normalizes out volume difference, note different y-axis scale but same sound.
shifter = SignalShifter(p=1, max_pct=0.5)
print(shifter)
audio_orig.show()
altered = shifter(audio_orig, split_idx=0)
altered.show()
# SignalShifter is applied to a spectrogram here as well as raw audio
sg_orig.show()
altered = shifter(sg_orig, split_idx=0)
altered.show()
# shift a batch of 8 signals
audio_orig = _audio_batch_ex(8)
shifter = SignalShifter(p=1, max_pct=1)
AudioItem((audio_orig.sig[0], 16000, None)).show()
altered = shifter(audio_orig, split_idx=0)
#AudioItem((audio_orig[0], 16000, None)).show()
print(altered.sig.shape)
for sig in altered.sig:
AudioItem((sig, 16000, None)).show()
# timing: single item, batch of 32, and spectrogram
audio_orig = _audio_sc_ex()
%%time
altered = shifter(audio_orig, split_idx=0)
audio_orig = _audio_batch_ex(32)
%%time
altered = shifter(audio_orig, split_idx=0)
%%time
altered = shifter(sg_orig, split_idx=0)
audio_orig = _audio_sc_ex()
shifter = SignalShifter(p=1, max_pct=0.5)
shifted = shifter(audio_orig, split_idx=0)
audio_orig.show()
shifted.show()
test_eq(audio_orig.sig.shape, shifted.sig.shape)
# test a time shift of 1s never shifts more than 1s
for i in range(100):
time_shifter = SignalShifter(p=1, max_time=1)
just_ones = AudioItem((torch.ones(16000).unsqueeze(0), 16000, None))
shifted = time_shifter(just_ones, split_idx=0)
test_eq(False, torch.allclose(shifted.sig, torch.zeros(16000)))
# demonstrate shifting works on multichannel data (uncomment to see)
shifter = SignalShifter(p=1, max_time=1)
mc_shifted = shifter(fake_multichannel, split_idx=0)
#mc_shifted.show()
audio_orig = _audio_sc_ex()
audio_orig.show()
shift_and_roll = SignalShifter(p=1, max_pct=0.4, roll=True)
shifted = shift_and_roll(audio_orig, split_idx=0)
shifted.show()
test_eq(audio_orig.sig.shape, shifted.sig.shape)
%%timeit -n10
shifted = shifter(audio_orig, split_idx=0)
%%timeit -n10
shifted = shift_and_roll(audio_orig, split_idx=0)
Adds noise proportional to the energy of the signal (the mean of its absolute value), scaled by the specified noise level.
This uses colorednoise (imported as 'cn'), developed by a data scientist named Felix Patzelt. It lets you use one simple function to create white, brown, pink, and other colors of noise. Each color corresponds to an exponent: violet is -2, blue is -1, white is 0, pink is 1, and brown is 2. We abstract this with a class that enumerates the list and shifts it down by two so the exponents are correct, and so that we get tab-completion.
Because this actually draws a spectrogram and does an istft on it, it is about 10x faster if we implement our own white noise (simple and worth doing since it's the most common noise we'll want to use). This is what the `if color == 0` line does: it overrides the library and generates white noise using our own simple algorithm.
For just plain white noise, if we ever want to remove the dependency on this library, the noise can be created with
noise = torch.randn_like(ai.sig) * ai.sig.abs().mean() * noise_level
# -- AddNoise: white noise at the default level and at 50% --
noisy = AddNoise()(audio_orig)
real_noisy = AddNoise(noise_level=0.5)(audio_orig)
msgs = ["Original Audio", "5% White Noise", "50% White Noise"]
for i, aud in enumerate([audio_orig, noisy, real_noisy]):
print(msgs[i])
aud.show()
# colored noise via the NoiseColor enum (backed by colorednoise)
noisy = AddNoise(color=NoiseColor.Pink)(audio_orig)
real_noisy = AddNoise(noise_level=1, color=NoiseColor.Pink)(audio_orig)
msgs = ["Original Audio", "5% Pink Noise", "100% Pink Noise"]
for i, aud in enumerate([audio_orig, noisy, real_noisy]):
print(msgs[i])
aud.show()
# demonstrate blue-noise on multichannel data (uncomment to see)
noisy = AddNoise(noise_level=0.5, color=NoiseColor.Blue)(fake_multichannel)
#noisy.show()
%%timeit -n10
noise = torch.from_numpy(cn.powerlaw_psd_gaussian(exponent=0, size=audio_orig.nsamples)).float()
scaled_noise = noise * audio_orig.sig.abs().mean() * 0.05
out = AudioItem((audio_orig.sig + scaled_noise,audio_orig.sr, audio_orig.path))
%%timeit -n10
#Same speed for white noise and brown noise using their algorithm
noise = torch.from_numpy(cn.powerlaw_psd_gaussian(exponent=2, size=audio_orig.nsamples)).float()
scaled_noise = noise * audio_orig.sig.abs().mean() * 0.05
out = AudioItem((audio_orig.sig + scaled_noise,audio_orig.sr, audio_orig.path))
%%timeit -n10
noisy = AddNoise()(audio_orig)
# -- ChangeVolume / SignalCutout / SignalLoss demos and timings --
audio_orig = AudioItem.create(ex_files[0])
#ipython player normalizes out volume difference, note different y-axis scale but same sound.
volume_adjuster = ChangeVolume(p=1, lower=0.01, upper=0.5)
print(volume_adjuster)
audio_orig.show()
altered = volume_adjuster(audio_orig, split_idx=0)
altered.show()
%%timeit -n10
volume_adjuster(audio_orig, split_idx=0)
%%timeit -n10
volume_adjuster(fake_multichannel, split_idx=0)
audio_orig = AudioItem.create(ex_files[0])
cutter = SignalCutout(p=1, max_cut_pct=0.3)
cut = cutter(audio_orig, split_idx=0)
cut.show()
# demonstrate SignalCutout on multichannel, confirm the cuts align, uncomment to show
cut_mc = SignalCutout(p=1, max_cut_pct=0.5)(fake_multichannel, split_idx=0)
#cut_mc.show()
%%timeit -n10
cutter(audio_orig, split_idx=0)
%%timeit -n10
cutter(fake_multichannel, split_idx=0)
audio_orig = AudioItem.create(ex_files[0])
dropper = SignalLoss(p=1, max_loss_pct=0.5)
dropped = dropper(audio_orig, split_idx=0)
print(f"Percent Dropped: {100*dropper.loss_pct:.2f}")
dropped.show()
# Updating to a RandTransform broke these tests
# verify SignalDrop is dropping both the correct number of samples, and dropping
# the same samples from each channel, over a wide range of cut_pcts
# nsamples = fake_multichannel.nsamples
# for cut_pct in np.linspace(0.05, 0.5, 45):
# dropped_mc = SignalDrop(cut_pct)(fake_multichannel)
# match1 = (dropped_mc.sig[0] == dropped_mc.sig[1]).sum()
# match2 = (dropped_mc.sig[0] == dropped_mc.sig[2]).sum()
# match3 = (dropped_mc.sig[1] == dropped_mc.sig[2]).sum()
# test_close(match1, cut_pct*nsamples, eps=.02*nsamples)
# test_close(match2, cut_pct*nsamples, eps=.02*nsamples)
# test_close(match3, cut_pct*nsamples, eps=.02*nsamples)
%%timeit -n10
dropper(audio_orig, split_idx=0)
%%timeit -n10
dropper(fake_multichannel, split_idx=0)
# -- DownmixMono: collapse channels to one (mean over the channel axis below) --
audio_orig = AudioItem.create(ex_files[0])
downmixed = DownmixMono()(fake_multichannel)
fake_multichannel.show()
downmixed.show()
# test downmixing 1 channel has no effect
downmixer = DownmixMono()
downmixed = downmixer(audio_orig)
test_eq(downmixed.sig, audio_orig.sig)
# example showing a batch of 4 signals
f2 = fake_multichannel.sig.unsqueeze(0)
fake_batch = torch.cat([f2,f2,f2,f2], dim=0)
downmixed = fake_batch.contiguous().mean(-2).unsqueeze(-2)
print("Before shape:", fake_batch.shape)
print("After shape:", downmixed.shape)
%%timeit -n10
downmixer(fake_multichannel)
# -- CropTime: crop or pad spectrograms along the time axis (ms) --
crop_1000ms = CropTime(1000)
crop_2000ms = CropTime(2000)
crop_5000ms = CropTime(5000)
print(f"Audio is {audio_orig.duration} seconds")
sg_orig = a2s(audio_orig)
s1 = crop_1000ms(sg_orig)
s1.show()
s2 = crop_2000ms(sg_orig)
s2.show()
s5 = crop_5000ms(sg_orig)
s5.show()
# settings metadata must survive the crop
test_eq(sg_orig.settings, s1.settings)
test_eq(sg_orig.settings, s5.settings)
test_close(s1.width, int((1/sg_orig.duration)*sg_orig.width), eps=1.01)
test_close(s2.width, int((2/sg_orig.duration)*sg_orig.width), eps=1.01)
test_close(s5.width, int((5/sg_orig.duration)*sg_orig.width), eps=1.01)
# test AudioToSpec->CropTime and CropSignal->AudioToSpec will result in same size images
oa = OpenAudio(files)
crop_dur = random.randint(1000,5000)
pipe_cropsig = Pipeline([oa, AudioToSpec(hop_length=128), CropTime(crop_dur)], as_item=True)
pipe_cropspec = Pipeline([oa, CropSignal(crop_dur), AudioToSpec(hop_length=128), ], as_item=True)
for i in range(50):
test_eq(pipe_cropsig(i).width, pipe_cropspec(i).width)
# test pad_mode zeros-after by verifying sg ends with zeros and begins with non-zeros
crop_5000ms = CropTime(5000, pad_mode=AudioPadType.Zeros_After)
s5 = crop_5000ms(sg_orig)
test_eq(s5[:,:,-1], torch.zeros_like(s5)[:,:,-1])
test_ne(s5[:,:,0], torch.zeros_like(s5)[:,:,-1])
sg_orig.duration
# test pad_mode repeat by making sure that columns are equal at the appropriate offsets
crop_12000ms_repeat = CropTime(12000, pad_mode=AudioPadType.Repeat)
s12_repeat = crop_12000ms_repeat(sg_orig)
s12_repeat.show()
for i in range(sg_orig.width):
test_eq(s12_repeat[:,:,i], s12_repeat[:,:,i+sg_orig.width])
test_eq(s12_repeat[:,:,i], s12_repeat[:,:,i+2*sg_orig.width])
# test bad pad_mode doesn't fail silently, correct is 'zeros_after'
test_fail(CropTime(12000, pad_mode="zerosafter"))
s1.shape, s2.shape, s5.shape
# demonstrate on multichannel audio, uncomment to show
sg_multi = a2s(fake_multichannel)
s1_mc = crop_1000ms(sg_multi)
#s1_mc.show()
%%timeit -n10
#1s zero-padded crop
crop_1000ms(sg_orig)
%%timeit -n10
#5s zero-padded crop
crop_5000ms(sg_orig)
%%timeit -n10
#12s repeat-padded crop
crop_12000ms_repeat(sg_orig)
# -- SpecAugment-style masking: MaskFreq / MaskTime --
freq_mask = MaskFreq()
%%time
freq_mask(sg_orig).show()
%%time
time_mask = MaskTime()
time_mask(sg_orig).show()
# create a random frequency mask and test that it is being correctly applied
size, start, val = [random.randint(1, 50) for i in range(3)]
freq_mask_test = MaskFreq(size=size, start=start, val=val)
sg_test = freq_mask_test(sg_orig)
sg_test.show()
test_eq(sg_test[:,start:start+size,:], val*torch.ones_like(sg_orig)[:,start:start+size,:])
# create a random time mask and test that it is being correctly applied
size, start, val = [random.randint(1, 50) for i in range(3)]
time_mask_test = MaskTime(size=size, start=start, val=val)
sg_test = time_mask_test(sg_orig)
sg_test.show()
test_eq(sg_test[:,:,start:start+size], val*torch.ones_like(sg_orig)[:,:,start:start+size])
# demonstrate on multichannel audio, uncomment to show, note bar is black so can be hard to see
sg_multi = a2s(fake_multichannel)
masked_mc = MaskFreq(size=40)(sg_multi)
#masked_mc.show()
%%timeit -n10
freq_mask(sg_orig)
%%timeit -n10
# time masking ~80µs slower because we transpose, delegate to MaskFreq, and transpose back, we could
# fix this at the expense of a bit more code
time_mask(sg_orig)
%%timeit -n10
freq_mask(sg_multi)
# -- SGRoll: shift the spectrogram along its time axis --
roller = SGRoll()
sg_orig.show()
roller(sg_orig).show()
roller(sg_orig).show()
#fails occasionally when by chance roll is 0, but i don't want to change to >= or <= because
#it won't detect a broken roll! Could maybe scrap this test, it's overly complex
def _first_non_zero_col(t):
for i in range(t.shape[2]):
if(t[0,0,i].item() == 1): return i
# Build a synthetic spectrogram with a known bar of ones so the roll direction
# can be detected by where the bar's first non-zero column lands.
roll_spec = a2s(audio_orig)
mid = int((roll_spec.width/2))-5
test_spec = torch.zeros_like(roll_spec)
test_spec[:,:,mid:mid+10] = 1
roll_spec.data = test_spec
left_roller = SGRoll(max_shift_pct=0.4, direction=-1)
left_spec = left_roller(roll_spec).data
right_roller = SGRoll(max_shift_pct=0.4, direction=1)
right_spec = right_roller(roll_spec).data
ostart, lstart, rstart = map(_first_non_zero_col, (test_spec, left_spec, right_spec))
# left roll moves the bar earlier, right roll moves it later
test(lstart, ostart, operator.lt)
test(rstart, ostart, operator.gt)
# demonstrate rolling on multichannel audio, uncomment to show
sg_multi = a2s(fake_multichannel)
rolled_mc = roller(sg_multi)
#rolled_mc.show()
%%timeit -n10
roller(sg_orig)
%%timeit -n10
roller(sg_multi)
# -- Delta transform (shape-preserving per the checks below) --
delta = Delta()
d = delta(sg_orig)
print("Shape",d.shape)
d.show()
#nchannels for a spectrogram is how many channels its original audio had
test_eq(d.nchannels, audio_orig.nchannels)
test_eq(d.shape[1:], sg_orig.shape[1:])
test_ne(d[0],d[1])
# demonstrate delta on multichannel audio, won't work until sg display is fixed
sg_multi = a2s(fake_multichannel)
delta_mc = delta(sg_multi)
delta_mc.show()
%%timeit -n10
delta(sg_orig)
%%timeit -n10
delta(sg_multi)
# -- TfmResize: resize spectrograms to a fixed size --
# Test when size is an int
size=224
resizer = TfmResize(size)
resized = resizer(sg_orig)
print("Original Shape: ", sg_orig.shape)
print("Resized Shape :" , resized.shape)
test_eq(resized.shape[1:], torch.Size([size,size]))
# Test when size is a tuple with unequal values
size_tup=(124,581)
resizer_tup = TfmResize(size_tup)
resized_tup = resizer_tup(sg_orig)
print("Original Shape: ", sg_orig.shape)
print("Resized Shape :" , resized_tup.shape)
resized_tup.show()
test_eq(resized_tup.shape[1:], torch.Size(size_tup))
# demonstrate resizing on multichannel audio, uncomment to show
sg_multi = a2s(fake_multichannel)
resized_mc = TfmResize((200,100))(sg_multi)
#resized_mc.show()
%%timeit -n10
resizer(sg_orig)
%%timeit -n10
resizer(sg_multi)
# -- End-to-end Pipelines combining the transforms above --
files
oa = OpenAudio(files); oa
#Show simple preprocessing
preprocess_pipe = Pipeline([oa, RemoveSilence(), CropSignal(2000), Resample(8000)], as_item=True)
for i in range(3): preprocess_pipe(i).show()
#Show a very noisy set of signal augmentations
augment_pipe1 = Pipeline([oa, RemoveSilence(), CropSignal(2000), AddNoise(noise_level=0.3), SignalLoss()], as_item=True)
for i in range(3): augment_pipe1(i).show()
#Show another set of signal augmentations
augment_pipe2 = Pipeline([oa, RemoveSilence(), CropSignal(2000), AddNoise(color=NoiseColor.Blue),
SignalShifter(roll=True), SignalCutout()], as_item=True)
for i in range(3): augment_pipe2(i).show()
#Basic melspectrogram pipe with advanced SpecAugment
sg_cfg = AudioConfig.BasicMelSpectrogram(hop_length=256, n_fft=2048)
pipe = Pipeline([oa, AudioToSpec.from_cfg(sg_cfg), CropTime(2000), MaskTime(num_masks=2, size=4), MaskFreq()], as_item=True)
for i in range(5): pipe.show(pipe(i))
#Pipe with only spectrogram transforms, notably Delta/Accelerate appended
voice_cfg = AudioConfig.Voice()
delta_pipe = Pipeline([oa, AudioToSpec.from_cfg(voice_cfg), CropTime(2000), Delta(), MaskTime(size=4), MaskFreq(), ], as_item=True)
for i in range(5): delta_pipe.show(delta_pipe(i))
for i in range(5): pipe.show(pipe(i))
#Pipe with signal and spectro transforms, and a lot of noise
voice_cfg = AudioConfig.Voice()
everything_pipe = Pipeline([oa,
RemoveSilence(), CropSignal(2000), AddNoise(noise_level=0.3), SignalLoss(),
AudioToSpec.from_cfg(voice_cfg), MaskTime(size=4), MaskFreq(), Delta()], as_item=True)
for i in range(5): everything_pipe.show(everything_pipe(i))